Two Sigma: Using News to Predict Stock Movementsยถ

First Time Load environment can't load againยถ

Code
In [1]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
# from plotly.tools import FigureFactory as FF 
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
Code
In [2]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()
print('Done!')
Loading the data... This could take a minute.
Done!
Done!
Code
In [3]:
(market_train_df, news_train_df) = env.get_training_data()
In [4]:
market_train, news_train = market_train_df.copy(), news_train_df.copy()
Code
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
%matplotlib inline
import seaborn as sns
import numpy as np
import plotly.figure_factory as ff


######### Function
def mis_value_graph(data):
#     data.isnull().sum().plot(kind="bar", figsize = (20,10), fontsize = 20)
#     plt.xlabel("Columns", fontsize = 20)
#     plt.ylabel("Value Count", fontsize = 20)
#     plt.title("Total Missing Value By Column", fontsize = 20)
#     for i in range(len(data)):
#          colors.append(generate_color())
            
    data = [
    go.Bar(
        x = data.columns,
        y = data.isnull().sum(),
        name = 'Unknown Assets',
        textfont=dict(size=20),
        marker=dict(
#         color= colors,
        line=dict(
            color='#000000',
            width=2,
        ), opacity = 0.45
    )
    ),
    ]
    layout= go.Layout(
        title= '"Total Missing Value By Column"',
        xaxis= dict(title='Columns', ticklen=5, zeroline=False, gridwidth=2),
        yaxis=dict(title='Value Count', ticklen=5, gridwidth=2),
        showlegend=True
    )
    fig= go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='skin')
    

def mis_impute(data):
    for i in data.columns:
        if data[i].dtype == "object":
            data[i] = data[i].fillna("other")
        elif (data[i].dtype == "int64" or data[i].dtype == "float64"):
            data[i] = data[i].fillna(data[i].mean())
        else:
            pass
    return data


import random

def generate_color():
    color = '#{:02x}{:02x}{:02x}'.format(*map(lambda x: random.randint(0, 255), range(3)))
    return color

1. market_train_df Data Investigationยถ

Code
In [6]:
mis_value_graph(market_train_df)
market_train_df = mis_impute(market_train_df)
market_train_df.isna().sum().to_frame()
timeassetCodeassetNamevolumecloseopenreturnsClosePrevRaw1returnsOpenPrevRaw1returnsClosePrevMktres1returnsOpenPrevMktres1returnsClosePrevRaw10returnsOpenPrevRaw10returnsClosePrevMktres10returnsOpenPrevMktres10returnsOpenNextMktres10universe020k40k60k80kExport to plot.ly ยป
Unknown Assets"Total Missing Value By Column"ColumnsValue Count
Out[6]:
0
time 0
assetCode 0
assetName 0
volume 0
close 0
open 0
returnsClosePrevRaw1 0
returnsOpenPrevRaw1 0
returnsClosePrevMktres1 0
returnsOpenPrevMktres1 0
returnsClosePrevRaw10 0
returnsOpenPrevRaw10 0
returnsClosePrevMktres10 0
returnsOpenPrevMktres10 0
returnsOpenNextMktres10 0
universe 0

1.1 Top-10 Largest Assets code by Close valueยถ

Code
In [7]:
# https://www.kaggle.com/pestipeti/simple-eda-two-sigma
Code
In [8]:
best_asset_volume = market_train_df.groupby("assetCode")["close"].count().to_frame().sort_values(by=['close'],ascending= False)
best_asset_volume = best_asset_volume.sort_values(by=['close'])
largest_by_volume = list(best_asset_volume.nlargest(10, ['close']).index)
# largest_by_volume
Code
In [9]:
for i in largest_by_volume:
    asset1_df = market_train_df[(market_train_df['assetCode'] == i) & (market_train_df['time'] > '2015-01-01') & (market_train_df['time'] < '2017-01-01')]
    # Create a trace
    trace1 = go.Scatter(
        x = asset1_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = asset1_df['close'].values,
        line = dict(color = generate_color()),opacity = 0.8
    )

    layout = dict(title = "Closing prices of {}".format(i),
                  xaxis = dict(title = 'Month'),
                  yaxis = dict(title = 'Price (USD)'),
                  )

    data = [trace1]
    py.iplot(dict(data=data, layout=layout), filename='basic-line')
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016657075808590Export to plot.ly ยป
Closing prices of CAH.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163436384042444648Export to plot.ly ยป
Closing prices of CAG.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016100110120130140150160Export to plot.ly ยป
Closing prices of UNH.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20164042444648Export to plot.ly ยป
Closing prices of UL.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 201681012141618Export to plot.ly ยป
Closing prices of CCJ.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 201681012141618202224Export to plot.ly ยป
Closing prices of TSU.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016404244464850525456Export to plot.ly ยป
Closing prices of TOT.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 2016100110120130140Export to plot.ly ยป
Closing prices of UHS.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20164045505560657075Export to plot.ly ยป
Closing prices of TSN.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Closing prices of CMS.NMonthPrice (USD)

1.2 Open and Close value of Top 10 Asset Codeยถ

Code
In [10]:
for i in largest_by_volume:

    asset1_df['high'] = asset1_df['open']
    asset1_df['low'] = asset1_df['close']

    for ind, row in asset1_df.iterrows():
        if row['close'] > row['open']:
            
            asset1_df.loc[ind, 'high'] = row['close']
            asset1_df.loc[ind, 'low'] = row['open']

    trace1 = go.Candlestick(
        x = asset1_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        open = asset1_df['open'].values,
        low = asset1_df['low'].values,
        high = asset1_df['high'].values,
        close = asset1_df['close'].values,
        increasing=dict(line=dict(color= generate_color())),
        decreasing=dict(line=dict(color= generate_color())))

    layout = dict(title = "Candlestick chart for {}".format(i),
                  xaxis = dict(
                      title = 'Month',
                      rangeslider = dict(visible = False)
                  ),
                  yaxis = dict(title = 'Price (USD)')
                 )
    data = [trace1]

    py.iplot(dict(data=data, layout=layout), filename='basic-line')
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for CAH.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for CAG.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for UNH.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for UL.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for CCJ.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for TSU.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for TOT.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for UHS.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for TSN.NMonthPrice (USD)
Apr 2015Jul 2015Oct 2015Jan 2016Apr 2016Jul 2016Oct 20163234363840424446Export to plot.ly ยป
Candlestick chart for CMS.NMonthPrice (USD)

1.3 Assets By Trading Daysยถ

Code
In [11]:
assetsByTradingDay = market_train_df.groupby(market_train_df['time'].dt.date)['assetCode'].nunique()
# Create a trace
trace1 = go.Bar(
    x = assetsByTradingDay.index, # asset1_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
    y = assetsByTradingDay.values, 
    marker=dict(
        color= generate_color(),
        line=dict(
            color=generate_color(),
            width=1.5,
        ), opacity = 0.8
    )
)

layout = dict(title = "Assets by trading days",
              xaxis = dict(title = 'Year'),
              yaxis = dict(title = 'Assets'))
data = [trace1]

py.iplot(dict(data=data, layout=layout), filename='basic-line')
20082010201220142016050010001500Export to plot.ly ยป
Assets by trading daysYearAssets

1.4 Asset Code Analysisยถ

Code
In [12]:
for i in range(1,100,10):
    volumeByAssets = market_train_df.groupby(market_train_df['assetCode'])['volume'].sum()
    highestVolumes = volumeByAssets.sort_values(ascending=False)[i:i+9]
    # Create a trace
    colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']
    trace1 = go.Pie(
        labels = highestVolumes.index,
        values = highestVolumes.values,
        textfont=dict(size=20),
        marker=dict(colors=colors,line=dict(color='#000000', width=2)), hole = 0.45)
    layout = dict(title = "Highest trading volumes for range of {} to {}".format(i, i+9))
    data = [trace1]
    py.iplot(dict(data=data, layout=layout), filename='basic-line')
14.6%13.5%13.4%12.5%11.6%10.3%8.76%8%7.38%Export to plot.ly ยป
GE.NF.NMSFT.OINTC.OCSCO.OPFE.NWFC.NJPM.NAAPL.OHighest trading volumes for range of 1 to 10
12.8%11.6%11.6%11.6%11.5%10.6%10.5%10%9.93%Export to plot.ly ยป
NOK.NORCL.OMU.OEMC.NYHOO.OPBR.NXOM.NFCX.NAMD.NHighest trading volumes for range of 11 to 20
11.9%11.9%11.4%11.3%11.1%11.1%10.8%10.3%10.2%Export to plot.ly ยป
FB.OMS.NCHK.NAMAT.OVALE.NCMCSA.ODELL.OVZ.NC.NHighest trading volumes for range of 21 to 30
11.9%11.5%11.4%11.2%11%11%10.7%10.7%10.6%Export to plot.ly ยป
BSX.NHAL.NNVDA.OGLW.NRAD.NEBAY.OLVS.NTSM.NMGM.NHighest trading volumes for range of 31 to 40
11.5%11.4%11.4%11.3%11.2%11%11%10.6%10.5%Export to plot.ly ยป
WMT.NALU.NKEY.NSIRI.OUSB.NFITB.OKO.NHD.NMO.NHighest trading volumes for range of 41 to 50
11.4%11.3%11.2%11.2%11.1%11%11%11%10.8%Export to plot.ly ยป
CX.NPG.NBMY.NNLY.NJNJ.NAUY.NJCP.NLOW.NMRVL.OHighest trading volumes for range of 51 to 60
11.7%11.5%11.2%11.2%11.2%11%10.8%10.8%10.7%Export to plot.ly ยป
DIS.NSYMC.OQ.NCOP.NGILD.ODOW.NBBD.NSPLS.OMRK.NHighest trading volumes for range of 61 to 70
11.4%11.4%11.4%11.3%11%10.9%10.9%10.9%10.8%Export to plot.ly ยป
ZNGA.OITUB.NKGC.NCVX.NGNW.NSBUX.OSLB.NABX.NAIG.NHighest trading volumes for range of 71 to 80
11.4%11.3%11.3%11.3%11.3%11.1%10.8%10.8%10.7%Export to plot.ly ยป
MRO.NAXP.NCVS.NBRCD.OGM.NBRCM.OATVI.OBP.NLUV.NHighest trading volumes for range of 81 to 90
11.5%11.4%11.2%11.1%11.1%11.1%11%10.9%10.8%Export to plot.ly ยป
PHM.NNEM.NWM.NGS.NPBRa.NBK.NMET.NWMB.NABT.NHighest trading volumes for range of 91 to 100

1.5 Unknown Value By Assets Codeยถ

Code
In [13]:
assetNameGB = market_train_df[market_train_df['assetName'] == 'Unknown'].groupby('assetCode')
unknownAssets = assetNameGB.size().reset_index('assetCode')
unknownAssets.columns = ['assetCode',"value"]
unknownAssets = unknownAssets.sort_values("value", ascending= False)
unknownAssets.head(5)

colors = []
for i in range(len(unknownAssets)):
     colors.append(generate_color())

        
data = [
    go.Bar(
        x = unknownAssets.assetCode.head(25),
        y = unknownAssets.value.head(25),
        name = 'Unknown Assets',
        textfont=dict(size=20),
        marker=dict(
        color= colors,
        line=dict(
            color='#000000',
            width=2,
        ), opacity = 0.45
    )
    ),
    ]
layout= go.Layout(
    title= 'Unknown Assets by Asset code',
    xaxis= dict(title='Columns', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='Value Count', ticklen=5, gridwidth=2),
    showlegend=True
)
fig= go.Figure(data=data, layout=layout)
py.iplot(fig, filename='skin')
TEVA.NOGZP.OTRQ.NCMVT.OQINFY.NBSMX.NGRFS.OSBGL.NFOX.OAUQ.NABEV.NDTEGY.OBEPIC.OTRH.NFCL.NACOM.OGWPH.OMXB.NLMCK.OHLTOY.ODDAIF.OBCPG.NPBA.NEURN.NBRGYY.OB020040060080010001200Export to plot.ly ยป
Unknown AssetsUnknown Assets by Asset codeColumnsValue Count

2.news_train_df Data Investigationยถ

Code
In [14]:
mis_value_graph(news_train_df)
news_train_df = mis_impute(news_train_df)
news_train_df.isna().sum().to_frame()
timesourceTimestampfirstCreatedsourceIdheadlineurgencytakeSequenceprovidersubjectsaudiencesbodySizecompanyCountheadlineTagmarketCommentarysentenceCountwordCountassetCodesassetNamefirstMentionSentencerelevancesentimentClasssentimentNegativesentimentNeutralsentimentPositivesentimentWordCountnoveltyCount12HnoveltyCount24HnoveltyCount3DnoveltyCount5DnoveltyCount7DvolumeCounts12HvolumeCounts24HvolumeCounts3DvolumeCounts5DvolumeCounts7Dโˆ’1โˆ’0.500.51Export to plot.ly ยป
Unknown Assets"Total Missing Value By Column"ColumnsValue Count
Out[14]:
0
time 0
sourceTimestamp 0
firstCreated 0
sourceId 0
headline 0
urgency 0
takeSequence 0
provider 0
subjects 0
audiences 0
bodySize 0
companyCount 0
headlineTag 0
marketCommentary 0
sentenceCount 0
wordCount 0
assetCodes 0
assetName 0
firstMentionSentence 0
relevance 0
sentimentClass 0
sentimentNegative 0
sentimentNeutral 0
sentimentPositive 0
sentimentWordCount 0
noveltyCount12H 0
noveltyCount24H 0
noveltyCount3D 0
noveltyCount5D 0
noveltyCount7D 0
volumeCounts12H 0
volumeCounts24H 0
volumeCounts3D 0
volumeCounts5D 0
volumeCounts7D 0
Code
In [15]:
print("News data shape",news_train_df.shape)
news_train_df.head()
News data shape (9328750, 35)
Out[15]:
time sourceTimestamp firstCreated sourceId headline urgency takeSequence provider subjects audiences bodySize companyCount headlineTag marketCommentary sentenceCount wordCount assetCodes assetName firstMentionSentence relevance sentimentClass sentimentNegative sentimentNeutral sentimentPositive sentimentWordCount noveltyCount12H noveltyCount24H noveltyCount3D noveltyCount5D noveltyCount7D volumeCounts12H volumeCounts24H volumeCounts3D volumeCounts5D volumeCounts7D
0 2007-01-01 04:29:32+00:00 2007-01-01 04:29:32+00:00 2007-01-01 04:29:32+00:00 e58c6279551b85cf China's Daqing pumps 43.41 mln tonnes of oil i... 3 1 RTRS {'ENR', 'ASIA', 'CN', 'NGS', 'EMRG', 'RTRS', '... {'Z', 'O', 'OIL'} 1438 1 False 11 275 {'0857.HK', '0857.F', '0857.DE', 'PTR.N'} PetroChina Co Ltd 6 0.235702 -1 0.500739 0.419327 0.079934 73 0 0 0 0 0 0 0 3 6 7
1 2007-01-01 07:03:35+00:00 2007-01-01 07:03:34+00:00 2007-01-01 07:03:34+00:00 5a31c4327427f63f FEATURE-In kidnapping, finesse works best 3 1 RTRS {'FEA', 'CA', 'LATAM', 'MX', 'INS', 'ASIA', 'I... {'PGE', 'PCO', 'G', 'ESN', 'MD', 'PCU', 'DNP',... 4413 1 FEATURE False 55 907 {'STA.N'} Travelers Companies Inc 8 0.447214 -1 0.600082 0.345853 0.054064 62 1 1 1 1 1 1 1 3 3 3
2 2007-01-01 11:29:56+00:00 2007-01-01 11:29:56+00:00 2007-01-01 11:29:56+00:00 1cefd27a40fabdfe PRESS DIGEST - Wall Street Journal - Jan 1 3 1 RTRS {'RET', 'ENR', 'ID', 'BG', 'US', 'PRESS', 'IQ'... {'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD... 2108 2 PRESS DIGEST False 15 388 {'WMT.DE', 'WMT.N'} Wal-Mart Stores Inc 14 0.377964 -1 0.450049 0.295671 0.254280 67 0 0 0 0 0 0 0 5 11 17
3 2007-01-01 12:08:37+00:00 2007-01-01 12:08:37+00:00 2007-01-01 12:08:37+00:00 23768af19dc69992 PRESS DIGEST - New York Times - Jan 1 3 1 RTRS {'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B... {'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD... 1776 6 PRESS DIGEST False 14 325 {'GOOG.O', 'GOOG.OQ', 'GOOGa.DE'} Google Inc 13 0.149071 -1 0.752917 0.162715 0.084368 83 0 0 0 0 0 0 0 5 13 15
4 2007-01-01 12:08:37+00:00 2007-01-01 12:08:37+00:00 2007-01-01 12:08:37+00:00 23768af19dc69992 PRESS DIGEST - New York Times - Jan 1 3 1 RTRS {'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B... {'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD... 1776 6 PRESS DIGEST False 14 325 {'XMSR.O'} XM Satellite Radio Holdings Inc 11 0.149071 -1 0.699274 0.209360 0.091366 102 0 0 0 0 0 0 0 0 0 0

2.1 Sentiment Count By Asset code or Urgencyยถ

Code
In [16]:
# news_train_df['urgency'].value_counts()
news_sentiment_count = news_train_df.groupby(["urgency","assetName"])[["sentimentNegative","sentimentNeutral","sentimentPositive"]].count()
news_sentiment_count = news_sentiment_count.reset_index()
Code
In [17]:
trace = go.Table(
    header=dict(values=list(news_sentiment_count.columns),
                fill = dict(color='rgba(55, 128, 191, 0.7)'),
                align = ['left'] * 5),
    cells=dict(values=[news_sentiment_count.urgency,news_sentiment_count.assetName,news_sentiment_count["sentimentNegative"], news_sentiment_count["sentimentPositive"], news_sentiment_count["sentimentNeutral"]],
               fill = dict(color='rgba(245, 246, 249, 1)'),
               align = ['left'] * 5))

data = [trace] 
py.iplot(data, filename = 'pandas_table')
1111111111111111111111111111111111urgency1 800 CONTACTSInc1-800-Flowers.ComInc1347 PropertyInsuranceHoldings Inc180 DegreeCapital Corp1st CenturyBancshares Inc1st ConstitutionBancorp1st Source Corp1st UnitedBancorp Inc21Vianet GroupInc21st CenturyInsurance Group21st CenturyOncology Inc21st North Inc22nd CenturyGroup Inc24/7 Real MediaInc2U Inc3Com Corp3D Systems Corp3M Co3M Cogent Inc3Par Inc3SBio Inc4 KidsEntertainment Inc500.Com Ltd51job Inc58.com Inc6D GlobalTechnologies Inc7 Days GroupHoldings Ltd8Point3 EnergyPartners LP8x8 Inc99 Cents OnlyStores@Road IncA G Edwards IncA Schulman IncA-Mark PreciousMetals IncassetName273468615893158148115453184512483331631667732271135186120572653412356212513031429213480683sentimentNegative273468615893158148115453184512483331631667732271135186120572653412356212513031429213480683sentimentNeutral273468615893158148115453184512483331631667732271135186120572653412356212513031429213480683sentimentPositiveExport to plot.ly ยป
Code
In [18]:
trace0 = go.Bar(
    x= news_sentiment_count.assetName.head(30),
    y=news_sentiment_count.sentimentNegative.values,
    name='sentimentNegative',
    textfont=dict(size=20),
        marker=dict(
        color= generate_color(),
        opacity = 0.87
    )
)
trace1 = go.Bar(
    x= news_sentiment_count.assetName.head(30),
    y=news_sentiment_count.sentimentNeutral.values,
    name='sentimentNeutral',
    textfont=dict(size=20),
        marker=dict(
        color= generate_color(),
        opacity = 0.87
    )
)
trace2 = go.Bar(
    x= news_sentiment_count.assetName.head(30),
    y=news_sentiment_count.sentimentPositive.values,
    name='sentimentPositive',
    textfont=dict(size=20),
    marker=dict(
        color= generate_color(),
        opacity = 0.87
    )
)

data = [trace0, trace1, trace2]
layout = go.Layout(
    xaxis=dict(tickangle=-45),
    barmode='group',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='angled-text-bar')
1 800 CONTACTS Inc1-800-Flowers.Com Inc1347 Property Insurance Holdings Inc180 Degree Capital Corp1st Century Bancshares Inc1st Constitution Bancorp1st Source Corp1st United Bancorp Inc21Vianet Group Inc21st Century Insurance Group21st Century Oncology Inc21st North Inc22nd Century Group Inc24/7 Real Media Inc2U Inc3Com Corp3D Systems Corp3M Co3M Cogent Inc3Par Inc3SBio Inc4 Kids Entertainment Inc500.Com Ltd51job Inc58.com Inc6D Global Technologies Inc7 Days Group Holdings Ltd8Point3 Energy Partners LP8x8 Inc99 Cents Only Stores0500100015002000Export to plot.ly ยป
sentimentNegativesentimentNeutralsentimentPositive
Code
In [19]:
news_sentiment_urgency = news_train_df.groupby(["urgency"])[["sentimentNegative","sentimentNeutral","sentimentPositive"]].count()
news_sentiment_urgency = news_sentiment_urgency.reset_index()
Code
In [20]:
trace = go.Table(
    header=dict(values=list(news_sentiment_urgency.columns),
                fill = dict(color='rgba(55, 128, 191, 0.7)'),
                align = ['left'] * 5),
    cells=dict(values=[news_sentiment_urgency.urgency,news_sentiment_urgency["sentimentNegative"], news_sentiment_urgency["sentimentPositive"], news_sentiment_urgency["sentimentNeutral"]],
               fill = dict(color='rgba(245, 246, 249, 1)'),
               align = ['left'] * 5))

data = [trace] 
py.iplot(data, filename = 'pandas_table')
123urgency3166158256162567sentimentNegative3166158256162567sentimentNeutral3166158256162567sentimentPositiveExport to plot.ly ยป
Code
In [21]:
trace0 = go.Bar(
    x= news_sentiment_urgency.urgency.values,
    y=news_sentiment_urgency.sentimentNegative.values,
    name='sentimentNegative',
    textfont=dict(size=20),
        marker=dict(
        color= generate_color(),
            line=dict(
            color='#000000',
            width=2,
        ),
        opacity = 0.87
    )
)
trace1 = go.Bar(
    x= news_sentiment_urgency.urgency.values,
    y=news_sentiment_urgency.sentimentNegative.values,
    name='sentimentNeutral',
    textfont=dict(size=20),
        marker=dict(
        color= generate_color(),
        line=dict(
            color='#000000',
            width=2,
        ),
        opacity = 0.87
    )
)
trace2 = go.Bar(
    x= news_sentiment_urgency.urgency.values,
    y=news_sentiment_urgency.sentimentNegative.values,
    name='sentimentPositive',
    textfont=dict(size=20),
    marker=dict(
        line=dict(
            color='#000000',
            width=2,
        ),
        color= generate_color(),
        opacity = 0.87
    )
)
data = [trace0, trace1, trace2]
layout = go.Layout(
    xaxis=dict(tickangle=-45),
    barmode='group',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='angled-text-bar')
12301M2M3M4M5M6MExport to plot.ly ยป
sentimentNegativesentimentNeutralsentimentPositive

3.Data Prepareยถ

Code
In [22]:
%%time
def data_prep(market_train,news_train):
    market_train.time = market_train.time.dt.date
    news_train.time = news_train.time.dt.hour
    news_train.sourceTimestamp= news_train.sourceTimestamp.dt.hour
    news_train.firstCreated = news_train.firstCreated.dt.date
    news_train['assetCodesLen'] = news_train['assetCodes'].map(lambda x: len(eval(x)))
    news_train['assetCodes'] = news_train['assetCodes'].map(lambda x: list(eval(x))[0])
    kcol = ['firstCreated', 'assetCodes']
    news_train = news_train.groupby(kcol, as_index=False).mean()
    market_train = pd.merge(market_train, news_train, how='left', left_on=['time', 'assetCode'], 
                            right_on=['firstCreated', 'assetCodes'])
    lbl = {k: v for v, k in enumerate(market_train['assetCode'].unique())}
    market_train['assetCodeT'] = market_train['assetCode'].map(lbl)
    
    
    market_train = market_train.dropna(axis=0)
    
    return market_train

market_train = data_prep(market_train_df, news_train_df)
market_train.shape
CPU times: user 25.8 s, sys: 19.8 s, total: 45.6 s
Wall time: 45.6 s
In [23]:
%%time
from datetime import datetime, date
# The target is binary
market_train = market_train.loc[market_train['time_x']>=date(2009, 1, 1)]
up = market_train.returnsOpenNextMktres10 >= 0
fcol = [c for c in market_train if c not in ['assetCode', 'assetCodes', 'assetCodesLen', 'assetName', 'audiences', 
                                             'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'provider', 
                                             'returnsOpenNextMktres10', 'sourceId', 'subjects', 'time', 'time_x', 'universe','sourceTimestamp']]
CPU times: user 208 ms, sys: 164 ms, total: 372 ms
Wall time: 371 ms
In [24]:
%%time
# We still need the returns for model tuning
X = market_train[fcol].values
up = up.values
r = market_train.returnsOpenNextMktres10.values

# Scaling of X values
# It is good to keep these scaling values for later
mins = np.min(X, axis=0)
maxs = np.max(X, axis=0)
rng = maxs - mins
X = 1 - ((maxs - X) / rng)

# Sanity check
assert X.shape[0] == up.shape[0] == r.shape[0]
CPU times: user 356 ms, sys: 500 ms, total: 856 ms
Wall time: 850 ms

4.Model Trainingยถ

In [25]:
%%time
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import time

X_train, X_test, up_train, up_test, r_train, r_test = model_selection.train_test_split(X, up, r, test_size=0.25, random_state=99)
CPU times: user 496 ms, sys: 336 ms, total: 832 ms
Wall time: 1 s
In [26]:
xgb_up = XGBClassifier(n_jobs=4,n_estimators=250,max_depth=8,eta=0.1)
In [27]:
t = time.time()
print('Fitting Up')
xgb_up.fit(X_train,up_train)
print(f'Done, time = {time.time() - t}')
Fitting Up
Done, time = 345.907133102417
In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(xgb_up.predict(X_test),up_test)
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning:

The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.

Out[28]:
0.5517993024730501

5.Final Submissionยถ

Feature Gain & Splitยถ

Code
In [29]:
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.DataFrame({'imp': xgb_up.feature_importances_, 'col':fcol})
df = df.sort_values(['imp','col'], ascending=[True, False])
# _ = df.plot(kind='barh', x='col', y='imp', figsize=(7,12))


#plt.savefig('lgb_gain.png')
trace = go.Table(
    header=dict(values=list(df.columns),
                fill = dict(color='rgba(55, 128, 191, 0.7)'),
                align = ['left'] * 5),
    cells=dict(values=[df.imp,df.col],
               fill = dict(color='rgba(245, 246, 249, 1)'),
               align = ['left'] * 5))

data = [trace] 
py.iplot(data, filename = 'pandas_table')
0.0065478910692036150.0077661033719778060.0083752097561955450.0088015841320157050.010293893516063690.0103548038750886920.0116643821820616720.0143139939755201340.0151971979066729550.0167199634015560150.0197654943913221360.020161412656307220.0203441455960273740.0208923406898975370.0212578047066926960.02144053578376770.0243337899446487430.0285975337028503420.0286279879510402680.0292066391557455060.032708998769521710.032800365239381790.035236790776252750.036911830306053160.037460029125213620.0380082242190837860.0401400960981845860.0405055582523345950.04074919968843460.041419215500354770.043155170977115630.043155170977115630.043185625225305560.04543931782245636impnoveltyCount24HnoveltyCount5DurgencynoveltyCount3DnoveltyCount7DsentimentClassnoveltyCount12HtakeSequencevolumeCounts12HvolumeCounts24HvolumeCounts3DfirstMentionSentencecompanyCountrelevancewordCountvolumeCounts5DsentenceCountbodySizeopenvolumeCounts7Dtime_ysentimentPositivesentimentWordCountsentimentNeutralreturnsOpenPrevMktres1sentimentNegativereturnsClosePrevRaw10returnsClosePrevRaw1returnsClosePrevMktres1returnsClosePrevMktres10returnsOpenPrevRaw1returnsOpenPrevMktres10returnsOpenPrevRaw10closecolExport to plot.ly ยป
Code
In [30]:
data = [df]
for dd in data:  
    colors = []
    for i in range(len(dd)):
         colors.append(generate_color())

    data = [
        go.Bar(
        orientation = 'h',
        x=dd.imp,
        y=dd.col,
        name='Features',
        textfont=dict(size=20),
            marker=dict(
            color= colors,
            line=dict(
                color='#000000',
                width=0.5
            ),
            opacity = 0.87
        )
    )
    ]
    layout= go.Layout(
        title= 'Feature Importance of XGBOOST',
        xaxis= dict(title='Columns', ticklen=5, zeroline=True, gridwidth=2),
        yaxis=dict(title='Value Count', ticklen=5, gridwidth=2),
        showlegend=True
    )

    py.iplot(dict(data=data,layout=layout), filename='horizontal-bar')
00.010.020.030.040.05noveltyCount24HurgencynoveltyCount7DnoveltyCount12HvolumeCounts12HvolumeCounts3DcompanyCountwordCountsentenceCountopentime_ysentimentWordCountreturnsOpenPrevMktres1returnsClosePrevRaw10returnsClosePrevMktres1returnsOpenPrevRaw1returnsOpenPrevRaw10volumeExport to plot.ly ยป
FeaturesFeature Importance of XGBOOSTColumnsValue Count
Code
In [31]:
days = env.get_prediction_days()
Output
Code
In [32]:
import time

n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    print(n_days,end=' ')
    t = time.time()
    market_obs_df = data_prep(market_obs_df, news_obs_df)
    market_obs_df = market_obs_df[market_obs_df.assetCode.isin(predictions_template_df.assetCode)]
    X_live = market_obs_df[fcol].values
    X_live = 1 - ((maxs - X_live) / rng)
    prep_time += time.time() - t
    
    t = time.time()
    lp = xgb_up.predict_proba(X_live)
    prediction_time += time.time() -t
    
    t = time.time()
    confidence = 2* lp[:,1] -1
    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'],'confidence':confidence})
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t
    
env.write_submission_file()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 Your submission file has been saved. Once you `Commit` your Kernel and it finishes running, you can submit the file to the competition from the Kernel Viewer `Output` tab.
Output
Code
In [33]:
sub  = pd.read_csv("submission.csv")
sub.head()
Out[33]:
time assetCode confidenceValue
0 2017-01-03 A.N 0.119038
1 2017-01-03 AA.N 0.000000
2 2017-01-03 AAL.O 0.000000
3 2017-01-03 AAN.N 0.000000
4 2017-01-03 AAP.N 0.000000
In [34]:
import matplotlib.pyplot as plt
%matplotlib inline
from xgboost import plot_importance
plt.figure(num=None, figsize=(15, 10), dpi=80, facecolor='w', edgecolor='k')
plt.bar(range(len(xgb_up.feature_importances_)), xgb_up.feature_importances_)
plt.xticks(range(len(xgb_up.feature_importances_)), fcol, rotation='vertical');
In [ ]: